## Join demographics onto performance records, keeping every performance row
## even when a customer has no demographic record (all.y = TRUE).
## Idiom fix: spell out TRUE/FALSE -- T and F are ordinary, reassignable names.
train_data <- merge(traindemographics, trainperf, all.y = TRUE, by = "customerid")
test_data <- merge(testdemographics, testperf, all.y = TRUE, by = "customerid")
## Tag each set before stacking so train/test can be separated again after the
## shared preprocessing; fill = TRUE pads columns present in only one set.
loan_data <- rbind(train_data[, set := "train"], test_data[, set := "test"], fill = TRUE)
## Parse the raw date columns into Date objects, then derive the customer's
## age (in years) and calendar features of the approval date in one pass.
dates <- c("birthdate", "approveddate", "creationdate")
loan_data[, (dates) := lapply(.SD, as.Date), .SDcols = dates]
loan_data[, `:=`(
  age          = as.numeric(approveddate - birthdate) / 365,
  # NOTE: "aprove" misspelling kept deliberately -- the column is later
  # deleted by this exact name.
  aprove_month = month(approveddate),
  approve_day  = wday(approveddate),
  approve_year = year(approveddate)
)]
## Class balance: percentage of Good vs Bad loans as a labelled bar chart.
flag_counts <- loan_data[!is.na(good_bad_flag), .N, by = .(good_bad_flag)]
flag_counts[, perc := round(N / sum(N) * 100, 2)]
ggplot(flag_counts, aes(good_bad_flag, perc, fill = good_bad_flag)) +
  geom_bar(stat = "identity") +
  geom_text(aes(good_bad_flag, perc, label = paste(perc, "%"),
                vjust = .05, hjust = .5),
            size = 4) +
  theme_hc() +
  labs(title = "Percentage of bad loans") +
  scale_fill_colorblind(name = "") +
  theme(legend.position = "none")
## Missing-value percentage per variable, sorted, as a horizontal bar chart.
## Idiom fix: geom_col() is the documented shorthand for
## geom_bar(stat = "identity") -- identical output.
ggplot(missing_perc, aes(reorder(variables, perc), perc)) +
  geom_col() +
  theme_fivethirtyeight() +
  coord_flip()

## Treat loannumber as a continuous feature.
loan_data[, loannumber := as.numeric(loannumber)]
## Variables with > 50% missing values (candidates for removal; kept here --
## presumably consumed later in the full script).
missing_var_del <- missing_perc[perc > 50, variables]
## KNN imputation
library(VIM)
## Drop raw date columns (their information was extracted into features
## above) and the mostly-empty referral id before imputing.
loan_data[, (dates) := NULL]
loan_data[, referredby := NULL]
## VIM::kNN appends a logical "<col>_imp" indicator column per imputed column.
loan_data <- kNN(loan_data, useImputedDist = FALSE, k = 10)
setDT(loan_data)  # kNN returns a data.frame; restore data.table semantics
nms_all <- names(loan_data)
nms_imp <- nms_all[grepl("_imp$", nms_all)]
## Recode the logical imputation flags to numeric 0/1. as.numeric() on a
## logical is exactly the original ifelse(x == FALSE, 0, 1), minus the
## per-element branching.
loan_data[, (nms_imp) := lapply(.SD, as.numeric), .SDcols = nms_imp]
## Indicator columns that sum to zero mean nothing was imputed there -- they
## carry no information, so drop them.
col_sum_imp <- loan_data[, colSums(.SD), .SDcols = nms_imp]
col_sum_imp <- names(col_sum_imp[col_sum_imp == 0])
#var_importants <- fread("var_importanta.csv")
loan_data[, (col_sum_imp) := NULL]
## Fix the target's factor levels so "Bad" is the first level (caret's
## twoClassSummary treats the first level as the event class).
loan_data[, good_bad_flag := factor(good_bad_flag, levels = c("Bad", "Good"))]
## Drop bookkeeping columns before modelling.
## BUG FIX: " good_bad_flag_imp" had a stray leading space, so the intended
## indicator column was never matched by name.
nms_del1 <- c("set_imp", "good_bad_flag_imp",
              "approve_year", "aprove_month",
              "year", "systemloanid")
## Robustness: only delete columns that actually exist (e.g. some of these
## may already have been removed as all-zero indicators above).
nms_del1 <- intersect(nms_del1, names(loan_data))
loan_data[, (nms_del1) := NULL]
## Collect the names of numeric feature columns to rescale, excluding the
## 0/1 imputation flags and the target.
class_nms <- sapply(loan_data, class)
nums <- names(class_nms)[class_nms == "numeric"]
nums <- nums[!grepl("_imp|good_bad_flag", nums)]
## Min-max scale a numeric vector to [0, 1].
##
## x     numeric vector.
## na.rm drop NAs when computing the range (new argument; default TRUE is
##       backward-compatible here because scaling runs after imputation,
##       i.e. on NA-free columns).
##
## Robustness fix: a constant vector made the original divide by zero and
## return NaN for every element; return zeros instead.
zero_one <- function(x, na.rm = TRUE) {
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  if (span == 0) {
    return(rep(0, length(x)))
  }
  (x - rng[1]) / span
}
## Min-max scale every numeric feature column to [0, 1] in place.
loan_data[, (nums) := lapply(.SD, zero_one), .SDcols = nums]
## Split the preprocessed data back into train and test sets.
train_data <- loan_data[set == "train"]
train_data[, set := NULL]
test_data <- loan_data[set == "test"]
test_data[, set := NULL]
## Balance the classes by random undersampling: keep as many "Good" rows as
## there are "Bad" rows.
train_bad <- train_data[good_bad_flag == "Bad"]
train_good <- train_data[good_bad_flag == "Good"]
## Idiom fixes: `<-` instead of `=` for assignment, and sample(n, k)
## instead of sample(1:n, k) -- same RNG draws, avoids the 1:n footgun.
n_row <- nrow(train_good)
n_row_dead <- nrow(train_bad)
set.seed(200)  # fixed seed so the undersample is reproducible
not_sample <- sample(n_row, n_row_dead)
train_good <- train_good[not_sample]
train_sampled <- rbind(train_bad, train_good)
## Model Cross validation
set.seed(100)
## BUG FIX: trainControl's `index` expects the TRAINING row indices of each
## fold, but createFolds() returns the held-out indices by default. As
## written, caret trained on ~10% of the data per fold -- confirmed by the
## captured output below ("Summary of sample sizes: 190, 191, ..." on 1906
## rows). returnTrain = TRUE supplies the complementary ~90% training sets.
cv_fold <- createFolds(train_sampled$good_bad_flag, k = 10, returnTrain = TRUE)
train_ctrl <- trainControl(method = "cv",
                           number = 10,
                           summaryFunction = twoClassSummary,  # ROC / Sens / Spec
                           classProbs = TRUE,  # class probabilities needed for ROC
                           allowParallel = TRUE,
                           index = cv_fold,  # shared folds across all models
                           verboseIter = FALSE,
                           savePredictions = TRUE,  # kept for caretEnsemble
                           search = "grid")
## Hyper-parameter grids for the three candidate learners.
## (Argument order inside each expand.grid() call is unchanged, so the
## resulting grid rows are identical to the original.)
xgb_grid <- expand.grid(nrounds = c(50, 100),
                        eta = 0.4,
                        max_depth = c(2, 3),
                        gamma = c(0, 0.01),
                        colsample_bytree = c(0.6, 0.8, 1),
                        min_child_weight = 1,
                        subsample = c(0.5, 0.8, 1))
ranger_grid <- expand.grid(splitrule = c("extratrees", "gini"),
                           mtry = c(10, 20, ncol(train_data) - 2),
                           min.node.size = c(1, 5))
svm_grid <- expand.grid(C = c(1, 3, 5, 20),
                        sigma = seq(0.001, 0.524, length.out = 7))
## NOTE(review): caret is already used above (createFolds/trainControl), so
## these library() calls presumably duplicate loads done earlier in the
## full script -- verify ordering.
library(caret)
library(caretEnsemble)
library(tictoc)
#tuneGrid= xgb_grid
## Train all three models on the balanced training set with identical
## resampling indices, so their fold results are directly comparable (and
## usable by caretEnsemble). ROC is the tuning/selection metric.
tic()
model_list <- caretList(
good_bad_flag~.,
data=train_sampled[, .SD, .SDcols = !"customerid"],  # drop the id column from the predictors
metric = "ROC",
trControl=train_ctrl,
tuneList = list(caretModelSpec(method="xgbTree",tuneGrid= xgb_grid ),
caretModelSpec(method = "svmRadial", tuneGrid = svm_grid),
caretModelSpec(method="ranger", tuneGrid= ranger_grid)
)
)
toc()  # wall-clock time for the whole training run
## 165.1 sec elapsed
## $xgbTree
## eXtreme Gradient Boosting
##
## 1906 samples
## 25 predictor
## 2 classes: 'Bad', 'Good'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 190, 191, 191, 190, 190, 191, ...
## Resampling results across tuning parameters:
##
## max_depth gamma colsample_bytree subsample nrounds ROC Sens
## 2 0.00 0.6 0.5 50 0.6675440 0.6278507
## 2 0.00 0.6 0.5 100 0.6597599 0.6304177
## 2 0.00 0.6 0.8 50 0.7217087 0.6621213
## 2 0.00 0.6 0.8 100 0.7043960 0.6627110
## 2 0.00 0.6 1.0 50 0.7397850 0.6685301
## 2 0.00 0.6 1.0 100 0.7215911 0.6707503
## 2 0.00 0.8 0.5 50 0.6585361 0.6123478
## 2 0.00 0.8 0.5 100 0.6525907 0.6214439
## 2 0.00 0.8 0.8 50 0.7186698 0.6599123
## 2 0.00 0.8 0.8 100 0.6986109 0.6564184
## 2 0.00 0.8 1.0 50 0.7408503 0.6595522
## 2 0.00 0.8 1.0 100 0.7235121 0.6673740
## 2 0.00 1.0 0.5 50 0.6664863 0.6193329
## 2 0.00 1.0 0.5 100 0.6570109 0.6271548
## 2 0.00 1.0 0.8 50 0.7107864 0.6501159
## 2 0.00 1.0 0.8 100 0.6971123 0.6602676
## 2 0.00 1.0 1.0 50 0.7367861 0.6664325
## 2 0.00 1.0 1.0 100 0.7189294 0.6642244
## 2 0.01 0.6 0.5 50 0.6587045 0.6265695
## 2 0.01 0.6 0.5 100 0.6575760 0.6348562
## 2 0.01 0.6 0.8 50 0.7198368 0.6625938
## 2 0.01 0.6 0.8 100 0.7049304 0.6590954
## 2 0.01 0.6 1.0 50 0.7421705 0.6600194
## 2 0.01 0.6 1.0 100 0.7245672 0.6678356
## 2 0.01 0.8 0.5 50 0.6594476 0.6091857
## 2 0.01 0.8 0.5 100 0.6566138 0.6249385
## 2 0.01 0.8 0.8 50 0.7179802 0.6610740
## 2 0.01 0.8 0.8 100 0.7015752 0.6516354
## 2 0.01 0.8 1.0 50 0.7413740 0.6526739
## 2 0.01 0.8 1.0 100 0.7221938 0.6643373
## 2 0.01 1.0 0.5 50 0.6542567 0.6133859
## 2 0.01 1.0 0.5 100 0.6567132 0.6263353
## 2 0.01 1.0 0.8 50 0.7143992 0.6636411
## 2 0.01 1.0 0.8 100 0.6994088 0.6540859
## 2 0.01 1.0 1.0 50 0.7367861 0.6664325
## 2 0.01 1.0 1.0 100 0.7178856 0.6650403
## 3 0.00 0.6 0.5 50 0.6521986 0.6110577
## 3 0.00 0.6 0.5 100 0.6516902 0.6220238
## 3 0.00 0.6 0.8 50 0.7066189 0.6566458
## 3 0.00 0.6 0.8 100 0.6907217 0.6482569
## 3 0.00 0.6 1.0 50 0.7241909 0.6659743
## 3 0.00 0.6 1.0 100 0.7048806 0.6622484
## 3 0.00 0.8 0.5 50 0.6533295 0.6257437
## 3 0.00 0.8 0.5 100 0.6496811 0.6112926
## 3 0.00 0.8 0.8 50 0.7000046 0.6522196
## 3 0.00 0.8 0.8 100 0.6867139 0.6448741
## 3 0.00 0.8 1.0 50 0.7187715 0.6624766
## 3 0.00 0.8 1.0 100 0.7027482 0.6592190
## 3 0.00 1.0 0.5 50 0.6520713 0.6184046
## 3 0.00 1.0 0.5 100 0.6429416 0.6128115
## 3 0.00 1.0 0.8 50 0.6927271 0.6428846
## 3 0.00 1.0 0.8 100 0.6814026 0.6448708
## 3 0.00 1.0 1.0 50 0.7183425 0.6635239
## 3 0.00 1.0 1.0 100 0.7000319 0.6558334
## 3 0.01 0.6 0.5 50 0.6635055 0.6273885
## 3 0.01 0.6 0.5 100 0.6530193 0.6209713
## 3 0.01 0.6 0.8 50 0.6986443 0.6549018
## 3 0.01 0.6 0.8 100 0.6855622 0.6474383
## 3 0.01 0.6 1.0 50 0.7224444 0.6648053
## 3 0.01 0.6 1.0 100 0.7060696 0.6599160
## 3 0.01 0.8 0.5 50 0.6599489 0.6207412
## 3 0.01 0.8 0.5 100 0.6518656 0.6219056
## 3 0.01 0.8 0.8 50 0.6984896 0.6516337
## 3 0.01 0.8 0.8 100 0.6849219 0.6423148
## 3 0.01 0.8 1.0 50 0.7194476 0.6632932
## 3 0.01 0.8 1.0 100 0.7032396 0.6542018
## 3 0.01 1.0 0.5 50 0.6571391 0.6179347
## 3 0.01 1.0 0.5 100 0.6494040 0.6186328
## 3 0.01 1.0 0.8 50 0.6957470 0.6474355
## 3 0.01 1.0 0.8 100 0.6831119 0.6391576
## 3 0.01 1.0 1.0 50 0.7173461 0.6650392
## 3 0.01 1.0 1.0 100 0.6990038 0.6590969
## Spec
## 0.6188807
## 0.6012625
## 0.6606161
## 0.6384609
## 0.6879020
## 0.6516350
## 0.6129367
## 0.5996342
## 0.6646959
## 0.6300672
## 0.6967615
## 0.6592170
## 0.6160751
## 0.5952051
## 0.6516380
## 0.6188798
## 0.6845206
## 0.6532707
## 0.5998692
## 0.5961439
## 0.6601507
## 0.6428915
## 0.6952461
## 0.6611943
## 0.6140933
## 0.5989399
## 0.6538544
## 0.6426639
## 0.7032929
## 0.6570017
## 0.6124558
## 0.5922880
## 0.6507087
## 0.6384675
## 0.6845206
## 0.6481425
## 0.6037164
## 0.5903130
## 0.6449907
## 0.6258717
## 0.6616637
## 0.6388155
## 0.5921684
## 0.5964857
## 0.6354301
## 0.6236545
## 0.6512880
## 0.6343834
## 0.5998734
## 0.5877549
## 0.6314659
## 0.6157237
## 0.6584013
## 0.6319372
## 0.6109569
## 0.5966067
## 0.6305436
## 0.6178262
## 0.6574691
## 0.6367166
## 0.6029056
## 0.5904248
## 0.6322864
## 0.6168991
## 0.6573545
## 0.6389298
## 0.6046464
## 0.5959014
## 0.6361284
## 0.6238861
## 0.6559509
## 0.6291390
##
## Tuning parameter 'eta' was held constant at a value of 0.4
## Tuning
## parameter 'min_child_weight' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 50, max_depth = 2, eta
## = 0.4, gamma = 0.01, colsample_bytree = 0.6, min_child_weight = 1
## and subsample = 1.
##
## $svmRadial
## Support Vector Machines with Radial Basis Function Kernel
##
## 1906 samples
## 25 predictor
## 2 classes: 'Bad', 'Good'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 190, 191, 191, 190, 190, 191, ...
## Resampling results across tuning parameters:
##
## C sigma ROC Sens Spec
## 1 0.00100000 0.4412492 0.6618623 0.3382454
## 1 0.08816667 0.7063337 0.6683058 0.6214489
## 1 0.17533333 0.7256582 0.6664370 0.6602756
## 1 0.26250000 0.6919439 0.6315584 0.6479259
## 1 0.34966667 0.7220906 0.6364704 0.6830113
## 1 0.43683333 0.7185227 0.6428808 0.6727497
## 1 0.52400000 0.7147828 0.6288937 0.6889579
## 3 0.00100000 0.5163466 0.4034898 0.6034076
## 3 0.08816667 0.7503086 0.6594411 0.7144810
## 3 0.17533333 0.7423805 0.6590902 0.7084203
## 3 0.26250000 0.7341790 0.6597951 0.6972255
## 3 0.34966667 0.7283172 0.6511635 0.6996721
## 3 0.43683333 0.7235423 0.6539646 0.6892970
## 3 0.52400000 0.7193765 0.6398573 0.6978099
## 5 0.00100000 0.5098943 0.3711403 0.6181310
## 5 0.08816667 0.7518890 0.6658526 0.7127298
## 5 0.17533333 0.7392287 0.6663227 0.6972262
## 5 0.26250000 0.7330108 0.6696994 0.6894161
## 5 0.34966667 0.7280544 0.6606077 0.6890720
## 5 0.43683333 0.7234473 0.6518581 0.6866204
## 5 0.52400000 0.7191750 0.6540742 0.6824247
## 20 0.00100000 0.6383193 0.5461143 0.6220235
## 20 0.08816667 0.7403581 0.6672478 0.6976935
## 20 0.17533333 0.7356718 0.6714449 0.6840560
## 20 0.26250000 0.7299851 0.6692351 0.6732156
## 20 0.34966667 0.7241865 0.6602530 0.6715803
## 20 0.43683333 0.7198412 0.6669151 0.6552680
## 20 0.52400000 0.7157682 0.6442871 0.6783449
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.08816667 and C = 5.
##
## $ranger
## Random Forest
##
## 1906 samples
## 25 predictor
## 2 classes: 'Bad', 'Good'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 190, 191, 191, 190, 190, 191, ...
## Resampling results across tuning parameters:
##
## splitrule mtry min.node.size ROC Sens Spec
## extratrees 10 1 0.8049571 0.6909181 0.7667202
## extratrees 10 5 0.8047689 0.6852059 0.7756981
## extratrees 20 1 0.8057977 0.6880028 0.7720858
## extratrees 20 5 0.8060334 0.6829887 0.7811791
## extratrees 25 1 0.8052493 0.6875334 0.7762797
## extratrees 25 5 0.8045445 0.6806559 0.7821136
## gini 10 1 0.7769078 0.6985025 0.7100580
## gini 10 5 0.7781485 0.6940717 0.7176355
## gini 20 1 0.7618497 0.6818278 0.7045735
## gini 20 5 0.7651843 0.6825285 0.7049258
## gini 25 1 0.7603744 0.6791466 0.7015445
## gini 25 5 0.7591756 0.6797294 0.7008435
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 20, splitrule = extratrees
## and min.node.size = 5.
##
## attr(,"class")
## [1] "caretList"
## Collect the cross-validation resamples of all three models and compare
## their sensitivity distributions ("Sens" = recall of the first factor
## level, here "Bad") side by side.
resamples_models <- resamples(model_list)
dotplot(resamples_models, metric = "Sens")
